library(here)
library(tidyverse)
library(conflicted)
# library(easystats)

# Load the exoplanet catalogue snapshot from data/.
# guess_max = Inf makes readr infer each column type from every row rather than
# only the leading rows; sparse columns can otherwise be guessed as logical,
# after which later non-logical values fail to parse and are replaced by NA.
# NOTE(review): presumably this is what triggers the parsing warning — confirm
# with problems(exoplanets) after re-running.
exoplanets <- read_csv(
  here("data", "exoplanet_catalog_080325.csv"),
  guess_max = Inf
)
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
  dat <- vroom(...)
  problems(dat)Rows: 7418 Columns: 98── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (12): name, planet_status, publication, detection_type, mass_measurement_type, radius_measurement_type,...
dbl  (83): mass, mass_error_min, mass_error_max, mass_sini, mass_sini_error_min, mass_sini_error_max, radius...
lgl   (2): hot_point_lon, star_magnetic_field
date  (1): updated
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the freshly loaded data for a first look.
exoplanets
library(skimr)
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
# Summarise every column, broken down by column type.
# NOTE(review): inline_hist() warns that Inf/-Inf values were converted to NA,
# so at least one numeric column contains infinite values.
skim(exoplanets)
Warning: There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names), mangled_skimmers$funs)`.
ℹ In group 0: .
Caused by warning:
! There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names), mangled_skimmers$funs)`.
Caused by warning in `inline_hist()`:
! Variable contains Inf or -Inf value(s) that were converted to NA.
── Data Summary ────────────────────────
                           Values    
Name                       exoplanets
Number of rows             7418      
Number of columns          98        
_______________________              
Column type frequency:               
  character                12        
  Date                     1         
  logical                  2         
  numeric                  83        
________________________             
Group variables            None      
library(naniar)
# Plot how many values are missing in each variable.
# options(repr.plot.width = 10, repr.plot.height = 20)
exoplanets %>%
  gg_miss_var()

library(visdat)
# Visual overview of the data frame produced by visdat.
# options(repr.plot.width = 20, repr.plot.height = 10)
exoplanets %>%
  vis_dat()

# List all column names of the catalogue.
colnames(exoplanets)
 [1] "name"                       "planet_status"              "mass"                      
 [4] "mass_error_min"             "mass_error_max"             "mass_sini"                 
 [7] "mass_sini_error_min"        "mass_sini_error_max"        "radius"                    
[10] "radius_error_min"           "radius_error_max"           "orbital_period"            
[13] "orbital_period_error_min"   "orbital_period_error_max"   "semi_major_axis"           
[16] "semi_major_axis_error_min"  "semi_major_axis_error_max"  "eccentricity"              
[19] "eccentricity_error_min"     "eccentricity_error_max"     "inclination"               
[22] "inclination_error_min"      "inclination_error_max"      "angular_distance"          
[25] "discovered"                 "updated"                    "omega"                     
[28] "omega_error_min"            "omega_error_max"            "tperi"                     
[31] "tperi_error_min"            "tperi_error_max"            "tconj"                     
[34] "tconj_error_min"            "tconj_error_max"            "tzero_tr"                  
[37] "tzero_tr_error_min"         "tzero_tr_error_max"         "tzero_tr_sec"              
[40] "tzero_tr_sec_error_min"     "tzero_tr_sec_error_max"     "lambda_angle"              
[43] "lambda_angle_error_min"     "lambda_angle_error_max"     "impact_parameter"          
[46] "impact_parameter_error_min" "impact_parameter_error_max" "tzero_vr"                  
[49] "tzero_vr_error_min"         "tzero_vr_error_max"         "k"                         
[52] "k_error_min"                "k_error_max"                "temp_calculated"           
[55] "temp_calculated_error_min"  "temp_calculated_error_max"  "temp_measured"             
[58] "hot_point_lon"              "geometric_albedo"           "geometric_albedo_error_min"
[61] "geometric_albedo_error_max" "log_g"                      "publication"               
[64] "detection_type"             "mass_measurement_type"      "radius_measurement_type"   
[67] "alternate_names"            "molecules"                  "star_name"                 
[70] "ra"                         "dec"                        "mag_v"                     
[73] "mag_i"                      "mag_j"                      "mag_h"                     
[76] "mag_k"                      "star_distance"              "star_distance_error_min"   
[79] "star_distance_error_max"    "star_metallicity"           "star_metallicity_error_min"
[82] "star_metallicity_error_max" "star_mass"                  "star_mass_error_min"       
[85] "star_mass_error_max"        "star_radius"                "star_radius_error_min"     
[88] "star_radius_error_max"      "star_sp_type"               "star_age"                  
[91] "star_age_error_min"         "star_age_error_max"         "star_teff"                 
[94] "star_teff_error_min"        "star_teff_error_max"        "star_detected_disc"        
[97] "star_magnetic_field"        "star_alternate_names"      
library(janitor)
# Frequency table of planet_status (counts and proportions).
tabyl(exoplanets, planet_status)
 planet_status    n percent
     Confirmed 7418       1
library(data.table)
# options(repr.matrix.max.rows=100)
# Preview the five rows with the smallest share of missing values, transposed
# so that every variable becomes one row and every planet one column.
# arrange() sorts on prop_miss_all, the column added by add_prop_miss().
preview <- exoplanets %>%
  add_prop_miss() %>%
  arrange(prop_miss_all) %>%
  head(5) %>%
  data.table::transpose(keep.names = "column")

preview
# View() opens the data viewer, so only call it in a live session; this keeps
# a non-interactive render of the notebook from trying to open a viewer.
if (interactive()) {
  View(preview)
}

We have a lot of features:

- Planet name
- Mass (M jup)
- Mass*sin(i) (M jup)
  - This is the minimum mass of the planet: when the orbital inclination i is unknown, only the product of the mass and sin(i) can be measured.

# Look up every planet whose name contains "TOI-784" (SQL LIKE-style pattern).
# filter() is namespace-qualified because conflicted is attached and no
# preference for filter() has been declared at this point in the script, so a
# bare filter() would stop with an ambiguity error in a fresh session.
exoplanets %>%
  dplyr::filter(str_like(name, "%TOI-784%"))
# Tell conflicted to resolve filter() to dplyr's version whenever another
# attached package exports a function with the same name.
conflicts_prefer(dplyr::filter)
[conflicted] Removing existing preference.[conflicted] Will prefer dplyr::filter over any other package.
# Keep only the rows where `discovered` equals 2023.
filter(exoplanets, discovered == 2023)
# Drop every column whose name contains "error", plus planet_status and
# updated, in a single selection. The result is only printed, not stored.
exoplanets %>%
  select(!c(contains("error"), planet_status, updated))
# Frequency table of detection_type (counts and proportions).
tabyl(exoplanets, detection_type)
                       detection_type    n      percent
                           Astrometry   46 0.0062011324
                  Astrometry, Imaging    1 0.0001348072
          Astrometry, Radial Velocity    3 0.0004044217
                              Imaging  922 0.1242922621
                  Imaging, Astrometry   49 0.0066055541
                   Imaging, Kinematic    2 0.0002696145
                       Imaging, Other   46 0.0062011324
           Imaging, Other, Astrometry    1 0.0001348072
            Imaging, Other, Kinematic    3 0.0004044217
             Imaging, Primary Transit    1 0.0001348072
 Imaging, Radial Velocity, Astrometry    1 0.0001348072
                            Kinematic    2 0.0002696145
                         Microlensing  313 0.0421946616
                                Other   42 0.0056619035
                       Other, Imaging    1 0.0001348072
            Other, Imaging, Kinematic    1 0.0001348072
               Other, Radial Velocity    1 0.0001348072
                      Primary Transit 4509 0.6078457805
          Primary Transit, Astrometry    1 0.0001348072
           Primary Transit, Kinematic    1 0.0001348072
     Primary Transit, Radial Velocity    7 0.0009436506
                 Primary Transit, TTV    2 0.0002696145
                      Radial Velocity 1145 0.1543542734
          Radial Velocity, Astrometry   99 0.0133459153
             Radial Velocity, Imaging    2 0.0002696145
     Radial Velocity, Primary Transit    7 0.0009436506
              Radial Velocity, Timing    1 0.0001348072
                                  TTV   32 0.0043138312
                               Timing  160 0.0215691561
                   Timing, Astrometry    1 0.0001348072
                    Timing, Kinematic   10 0.0013480723
                        Timing, Other    6 0.0008088434
# Print the data frame again; none of the visible steps above reassign it.
exoplanets
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCmxpYnJhcnkoaGVyZSkNCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeShjb25mbGljdGVkKQ0KIyBsaWJyYXJ5KGVhc3lzdGF0cykNCg0KZXhvcGxhbmV0cyA8LSByZWFkX2NzdihoZXJlKCJkYXRhIiwgImV4b3BsYW5ldF9jYXRhbG9nXzA4MDMyNS5jc3YiKSkNCmV4b3BsYW5ldHMNCmBgYA0KDQoNCmBgYHtyfQ0KbGlicmFyeShza2ltcikNCg0Kc2tpbShleG9wbGFuZXRzKQ0KYGBgDQoNCg0KYGBge3J9DQpsaWJyYXJ5KG5hbmlhcikNCiMgb3B0aW9ucyhyZXByLnBsb3Qud2lkdGggPSAxMCwgcmVwci5wbG90LmhlaWdodCA9IDIwKQ0KZ2dfbWlzc192YXIoZXhvcGxhbmV0cykNCmBgYA0KDQoNCmBgYHtyfQ0KbGlicmFyeSh2aXNkYXQpDQojIG9wdGlvbnMocmVwci5wbG90LndpZHRoID0gMjAsIHJlcHIucGxvdC5oZWlnaHQgPSAxMCkNCnZpc19kYXQoZXhvcGxhbmV0cykNCmBgYA0KDQoNCmBgYHtyfQ0KbmFtZXMoZXhvcGxhbmV0cykNCmBgYA0KDQoNCmBgYHtyfQ0KbGlicmFyeShqYW5pdG9yKQ0KZXhvcGxhbmV0cyAlPiUgdGFieWwocGxhbmV0X3N0YXR1cykNCmBgYA0KDQoNCmBgYHtyfQ0KbGlicmFyeShkYXRhLnRhYmxlKQ0KIyBvcHRpb25zKHJlcHIubWF0cml4Lm1heC5yb3dzPTEwMCkNCmV4b3BsYW5ldHMgJT4lIA0KICBhZGRfcHJvcF9taXNzKCkgJT4lDQogIGFycmFuZ2UocHJvcF9taXNzX2FsbCkgJT4lIA0KICBoZWFkKDUpICU+JSANCiAgZGF0YS50YWJsZTo6dHJhbnNwb3NlKGtlZXAubmFtZXM9ImNvbHVtbiIpIC0+IHByZXZpZXcNCg0KcHJldmlldw0KcHJldmlldyAlPiUgVmlldygpDQoNCmBgYA0KDQpXZSBoYXZlIGEgbG90IG9mIGZlYXR1cmVzOg0KLSBQbGFuZXQgbmFtZQ0KLSBNYXNzIChNIGp1cCkNCi0gTWFzcypzaW4oaSkgKE0ganVwKQ0KICAtIFRoaXMgZGVzY3JpYmVzIG1pbmltdW0gbWFzcyBvZiB0aGUgcGxhbmV0IGR1ZSB0byBpbmNsaW5hdGlvbiBlZmZlY3QNCg0KYGBge3J9DQpleG9wbGFuZXRzICU+JSANCiAgZmlsdGVyKG5hbWUgJT4lIHN0cl9saWtlKCIlVE9JLTc4NCUiKSkNCmBgYA0KDQoNCmBgYHtyfQ0KY29uZmxpY3RzX3ByZWZlcihkcGx5cjo6ZmlsdGVyKQ0KZXhvcGxhbmV0cyAlPiUgDQogIGZpbHRlcihkaXNjb3ZlcmVkID09IDIwMjMpDQpgYGANCg0KDQoNCmBgYHtyfQ0KIyByZW1vdmUgYW55IGNvbHVtbiB3aXRoIGVycm9yIGluIHRoZSBuYW1lDQpleG9wbGFuZXRzICU+JSANCiAgc2VsZWN0KC1jb250YWlucygiZXJyb3IiKSkgJT4lIA0KICBzZWxlY3QoLXBsYW5ldF9zdGF0dXMsIC11cGRhdGVkKQ0KDQpgYGANCg0KDQpgYGB7cn0NCmV4b3BsYW5ldHMgJT4lIA0KICB0YWJ5bCgiZGV0ZWN0aW9uX3R5cGUiKQ0KYGBgDQoNCmBgYHtyfQ0KZXhvcGxhbmV0cw0KYGBgDQoNCg==